import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.image as mpimg


# Load the bank client dataset into a DataFrame.
# The path is relative, so it resolves against the current working directory.
df = pd.read_csv("../DATA/bank-full.csv")


# Domain knowledge: for marketing purposes the clients may need to be split
# into three groups - took a loan, did not take a loan, unknown.
# Plot the age distribution with one colour per value of `loan`.
plt.figure(dpi=200, figsize=(12, 6))
sns.histplot(x='age', hue='loan', data=df)

<AxesSubplot:xlabel='age', ylabel='Count'>


# pdays = number of days since the client was last contacted in a previous
# campaign (numeric; 999 is a sentinel meaning "never contacted before").
# Filter the sentinel rows out so only real day counts are plotted.
plt.figure(dpi=200, figsize=(12, 6))
sns.histplot(x='pdays', data=df.loc[df['pdays'] != 999])

<AxesSubplot:xlabel='pdays', ylabel='Count'>


# Distribution of `duration`, split by contact channel.
# For scale: a value of 1000 is about 16 minutes (so presumably seconds).
plt.figure(dpi=200, figsize=(12, 6))
sns.histplot(x='duration', hue='contact', data=df)

<AxesSubplot:xlabel='duration', ylabel='Count'>


# Number of clients per education level, bars ordered by frequency
# (value_counts sorts from most to least common).
plt.figure(dpi=200, figsize=(12, 6))
sns.countplot(
    x='education',
    data=df,
    order=df['education'].value_counts().index,
)
# Vertical tick labels so the category names do not overlap.
plt.xticks(rotation=90);


from sklearn.preprocessing import StandardScaler

# One-hot encode the categorical columns so every feature is numeric.
X = pd.get_dummies(data=df)

# Put all features on a common scale before clustering.
# Fitting the scaler on the full data is not leakage here: clustering is
# unsupervised, so there is no label to leak.
scaler = StandardScaler()
X = scaler.fit(X).transform(X)


from sklearn.cluster import KMeans


# Seed the initialisation and use several restarts so the two-cluster
# solution is the same on every run. An unseeded KMeans may return a
# different partition (and swapped label numbers) each time it is fitted.
model = KMeans(n_clusters=2, n_init=10, random_state=42)

# fit_predict does two things in one call:
#   1. fits the model on all features to locate the cluster centres;
#   2. returns, for every row, the index of the cluster it belongs to.
# The 0/1 values are arbitrary identifiers with no inherent meaning - what
# matters is which rows end up sharing a cluster.
cluster_labels = model.fit_predict(X)
cluster_labels

array([1, 1, 1, ..., 0, 0, 0])


# X now holds the scaled array, so rebuild the one-hot encoded frame from df
# and append each row's cluster id as a new last column named 'Cluster'.
df = pd.get_dummies(df).assign(Cluster=cluster_labels)


# Correlation of every encoded feature with the cluster id, sorted ascending.
# The last entry is dropped because it is 'Cluster' correlated with itself.
plt.figure(dpi=200, figsize=(12, 6))
(
    df.corr()['Cluster']
    .iloc[:-1]
    .sort_values()
    .plot(kind='bar')
)

<AxesSubplot:>


# Elbow method: fit KMeans for K = 2..9 and record each model's inertia
# (sum of squared distances of samples to their closest cluster centre).
ssd = []

for k in range(2, 10):
    # Fixed seed + several restarts: keeps the curve reproducible and makes
    # it less likely that one unlucky initialisation inflates the SSD for a
    # single K relative to its neighbours.
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    model.fit(X)
    ssd.append(model.inertia_)

ssd

[2469792.3616627543,
 2370786.446603645,
 2271502.8081971155,
 2228290.0533834356,
 2157695.015264023,
 2074338.1385483479,
 2076251.5749846818,
 1995548.640403869]


# Elbow plot of SSD against K.
# Open a new figure first so that, when run outside a notebook, the curve is
# not drawn onto whichever figure happens to be current.
plt.figure()
plt.plot(range(2, 10), ssd, 'o--')
plt.xlabel("K Value")
plt.ylabel(" Sum of Squared Distances")

Text(0, 0.5, ' Sum of Squared Distances')


# Change in SSD relative to the previous K value.
# Entry i is ssd[i] - ssd[i-1]; the first entry has no predecessor and is NaN.
pd.Series(data=ssd).diff(periods=1)

0             NaN
1   -99005.915059
2   -99283.638407
3   -43212.754814
4   -70595.038119
5   -83356.876716
6     1913.436436
7   -80702.934581
dtype: float64


# Read the image file (path relative to the working directory) into an array.
image_as_array=mpimg.imread("Hadeel's_me_time.jpg")


# Display the raw pixel data: each innermost triple is one pixel's (R, G, B).
image_as_array

array([[[174, 153, 168],
        [175, 154, 169],
        [175, 154, 169],
        ...,
        [ 95,  96, 101],
        [ 96,  96, 104],
        [ 96,  96, 104]],

       [[175, 157, 171],
        [175, 157, 171],
        [176, 158, 172],
        ...,
        [ 95,  96, 101],
        [ 95,  95, 103],
        [ 95,  95, 103]],

       [[178, 160, 174],
        [179, 161, 175],
        [180, 162, 176],
        ...,
        [ 96,  97, 102],
        [ 95,  95, 103],
        [ 95,  95, 103]],

       ...,

       [[172, 127, 150],
        [172, 127, 150],
        [172, 127, 150],
        ...,
        [201, 225, 225],
        [201, 225, 225],
        [201, 225, 225]],

       [[172, 127, 150],
        [172, 127, 150],
        [172, 127, 150],
        ...,
        [202, 226, 226],
        [200, 224, 224],
        [200, 224, 224]],

       [[172, 127, 150],
        [171, 126, 149],
        [171, 126, 149],
        ...,
        [202, 226, 226],
        [199, 223, 223],
        [199, 223, 223]]], dtype=uint8)


# Render the original, unmodified picture.
plt.figure(dpi=100, figsize=(6, 6))
plt.imshow(image_as_array)

<matplotlib.image.AxesImage at 0x2547155cac8>


# The image is a 3-D array of shape (height, width, channels):
#   axis 0 (h)   -> number of pixel rows     (1280 for this image)
#   axis 1 (w)   -> number of pixel columns  (1162 for this image)
#   axis 2 (rgb) -> colour channels          (3: R, G, B)
h, w, rgb = image_as_array.shape
h, w, rgb

(1280, 1162, 3)


# Flatten the pixel grid to a 2-D array of shape (height * width, channels):
#   rows    -> one per pixel of the original image
#   columns -> that pixel's R, G, B values
# -1 lets NumPy infer the row count (h * w) from the array size.
images_as_2d_array = image_as_array.reshape(-1, rgb)

images_as_2d_array.shape

(1487360, 3)


from sklearn.cluster import KMeans


# Group the pixels into 20 colour clusters; `label` holds, for every pixel,
# the index of the cluster it was assigned to.
model = KMeans(n_clusters=20).fit(images_as_2d_array)
label = model.labels_

# The 20 cluster centres form the palette: one (R, G, B) triple per colour.
model.cluster_centers_

array([[179.08497623, 156.92511884, 144.75479109],
       [219.02596897, 221.15505002, 214.22445037],
       [104.81497285,  91.95223318,  83.54635376],
       [189.74249677, 172.71689254, 165.89847001],
       [181.18202613, 216.36212161, 228.68702586],
       [227.59565859, 220.00657499, 160.73084914],
       [167.62981414, 122.98687488, 145.0799323 ],
       [206.66526934, 207.06678184, 199.68490644],
       [217.50141054, 208.61443746, 148.80899082],
       [208.69399195, 153.46626657, 103.00016488],
       [ 39.04528227,  37.31329971,  36.27710809],
       [136.46617339, 111.22182378, 107.30387131],
       [132.0054727 , 153.92413463, 167.36051443],
       [185.50186031, 145.66167307, 164.82556187],
       [159.0311138 , 195.61132216, 213.05045797],
       [194.92928672, 229.25751486, 239.88826036],
       [159.31620303, 139.733546  , 122.44314397],
       [ 72.52850016,  69.40758237,  67.08604276],
       [194.91417204, 193.64641021, 189.2989296 ],
       [180.01698129, 191.71842945, 138.31592104]])


# Cluster index assigned to each pixel, in the same order as the rows of images_as_2d_array.
label

array([13, 13, 13, ..., 15, 15, 15])


# Round each cluster centre to whole-number RGB values, replace every pixel
# by the centre of its cluster, and restore the (h, w, rgb) image layout.
rgb_codes = model.cluster_centers_.round().astype(int)
quantized_image = rgb_codes[label].reshape(h, w, rgb)

# Show the colour-quantized picture.
plt.figure(dpi=100, figsize=(6, 6))
plt.imshow(quantized_image)

<matplotlib.image.AxesImage at 0x254715d9748>

Bank client data:

Exploratory Data Analysis

Training the Model

Determining K value using the elbow method

--------------------------------------------------------------------------------------------------------------

Color Quantization

Convert from 3D to 2D array

Training K-Means clustering